suppressPackageStartupMessages(library(tidyverse))
devtools::load_all(
  '~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities

Settings

# Project root. The trailing slash matters: the paths below are built with
# paste0(), which inserts no separator. Later code also uses a path relative
# to this directory, hence the setwd().
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
setwd(wd)

# Output directories for the exported tables and figures.
tabledir <- paste0(wd, 'Tables/DRS_diffthresh/')
figdir <- paste0(wd, 'Figures/DRS_diffthresh/')

# Session-wide ggplot2 theme: classic, 7 pt base size, legend at the bottom.
theme_set(theme_classic(base_size = 7) + theme(legend.position = 'bottom'))

Functions

#' Keep sites whose intensity increases significantly in both column sets
#'
#' A row is kept when both KS intensity p-values (the `_G` and `_I` columns)
#' are below `pval` and `c2_median_intensity` minus `c1_median_intensity`
#' exceeds `diff_med` in both `_G` and `_I`. Rows where any comparison is `NA`
#' are dropped, as usual for `filter()`.
#'
#' @param pval Upper bound (exclusive) for both KS intensity p-values.
#' @param diff_med Lower bound (exclusive) for both median-intensity
#'   differences.
#' @param data Table to filter. Defaults to the global
#'   `sampcomp_results_joined`, so existing two-argument calls behave as
#'   before; the default is only looked up when the function is called.
#' @return The rows of `data` that pass every threshold.
filter_KS_intensity_increase_pval_diff_median <- 
  function(pval, diff_med, data = sampcomp_results_joined) {
    data |> 
      filter(
        # Comma-separated conditions are combined with AND by filter().
        KS_intensity_pvalue_G < pval,
        KS_intensity_pvalue_I < pval,
        c2_median_intensity_G - c1_median_intensity_G > diff_med,
        c2_median_intensity_I - c1_median_intensity_I > diff_med
      )
  }

#' Tabulate the middle base of each k-mer
#'
#' Takes the third character of `ref_kmer` as `mid_base`, counts the rows per
#' base and expresses each count as a percentage of all rows. Despite the
#' function name, every base present is reported, not only C.
#'
#' @param df A data frame with a `ref_kmer` column.
#' @return An ungrouped tibble with columns `mid_base`, `n` and `percent`.
calc_midC_percentage <- function(df) {
  base_counts <- df |> 
    mutate(mid_base = str_sub(ref_kmer, start = 3, end = 3)) |> 
    group_by(mid_base) |> 
    summarise(n = n(), .groups = 'drop')
  
  mutate(base_counts, percent = 100 * n / sum(n))
}


#' Prefix a path with the analysis root
#'
#' @param path Path relative to the global `wd`.
#' @return `wd` and `path` concatenated without a separator.
paste_wd <- function(path) paste0(wd, path)

Read data

# Load the joined sampcomp results table (gzipped TSV under the project root).
sampcomp_results_joined <- 
  paste_wd('Tables/DRS/Positions/sampcomp_results_joined_2024-04-09.tsv.gz') |> 
  read_tsv()
## Rows: 5884004 Columns: 67
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (34): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (33): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble to preview its dimensions and first columns.
sampcomp_results_joined
## # A tibble: 5,884,004 × 67
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000264926.7 RAD18-201           1464 TCACA                    NA
##  2 ENST00000264926.7 RAD18-201           1465 CACAT                     1
##  3 ENST00000264926.7 RAD18-201           1466 ACATA                    NA
##  4 ENST00000264926.7 RAD18-201           1467 CATAA                     1
##  5 ENST00000264926.7 RAD18-201           1468 ATAAA                    NA
##  6 ENST00000264926.7 RAD18-201           1473 AACGA                     1
##  7 ENST00000264926.7 RAD18-201           1475 CGATC                    NA
##  8 ENST00000264926.7 RAD18-201           1486 ACACA                    NA
##  9 ENST00000264926.7 RAD18-201           1501 CAAGA                     1
## 10 ENST00000264926.7 RAD18-201           1502 AAGAC                    NA
## # ℹ 5,883,994 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …

Check previous results

# Thresholds for this check: p < 0.05 and any positive increase in median
# intensity.
pval_thresh <- .05
dif_med_intensity <- 0

# Use the shared filter helper instead of repeating its conditions inline, so
# this check and the threshold scan further down cannot drift apart.
filter_KS_intensity_increase_pval_diff_median(
  pval = pval_thresh, 
  diff_med = dif_med_intensity
)
## # A tibble: 605 × 67
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000429711.7 RPL32-204            422 GCCCA                 1    
##  2 ENST00000647248.2 RPL35A-211           380 ACCCC                 1    
##  3 ENST00000647248.2 RPL35A-211           381 CCCCT                 1    
##  4 ENST00000389680.2 MT-RNR1-201           43 ACACA                 1    
##  5 ENST00000389680.2 MT-RNR1-201           57 CCCCG                 1    
##  6 ENST00000389680.2 MT-RNR1-201           71 GTTCA                 1    
##  7 ENST00000389680.2 MT-RNR1-201           73 TCACC                 1    
##  8 ENST00000389680.2 MT-RNR1-201           75 ACCCT                 0.777
##  9 ENST00000389680.2 MT-RNR1-201           93 ATCAA                 1    
## 10 ENST00000389680.2 MT-RNR1-201          138 GCTTA                 1    
## # ℹ 595 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …

Try different threshold combinations

# Every combination of p-value cutoff and minimum median-intensity difference
# to evaluate.
params <- expand_grid(
  pval = c(0.001, 0.01, 0.05, 0.1),
  diff_med = c(0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 1)
)
params
## # A tibble: 36 × 2
##     pval diff_med
##    <dbl>    <dbl>
##  1 0.001     0   
##  2 0.001     0.01
##  3 0.001     0.02
##  4 0.001     0.03
##  5 0.001     0.04
##  6 0.001     0.05
##  7 0.001     0.1 
##  8 0.001     0.2 
##  9 0.001     1   
## 10 0.01      0   
## # ℹ 26 more rows
# For every (pval, diff_med) pair, filter the sites and tabulate their middle
# bases. mutate(map2()) + unnest() replaces the superseded pmap_dfr(): the
# threshold columns are carried by `params` itself instead of being rebuilt
# in a one-row tibble on every iteration.
midbase_percent_in_different_threshold <- 
  params |> 
  mutate(
    result = map2(pval, diff_med, \(p, d) {
      filter_KS_intensity_increase_pval_diff_median(pval = p, diff_med = d) |> 
        calc_midC_percentage()
    })
  ) |> 
  unnest(result)

# Write the table to `tabledir`.
# NOTE(review): the exported file name matches this object's name, so the
# helper presumably derives it from the piped expression; keep this call
# as is and confirm in myUtilities.
midbase_percent_in_different_threshold  |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_diffthresh/midbase_percent_in_different_threshold_2025-07-15.tsv
## # A tibble: 144 × 5
##     pval diff_med mid_base     n percent
##    <dbl>    <dbl> <chr>    <int>   <dbl>
##  1 0.001     0    A           26  12.6  
##  2 0.001     0    C          159  77.2  
##  3 0.001     0    G            2   0.971
##  4 0.001     0    T           19   9.22 
##  5 0.001     0.01 A           26  12.6  
##  6 0.001     0.01 C          159  77.2  
##  7 0.001     0.01 G            2   0.971
##  8 0.001     0.01 T           19   9.22 
##  9 0.001     0.02 A           26  12.6  
## 10 0.001     0.02 C          159  77.2  
## # ℹ 134 more rows

Plot

# Heatmap of the middle-C percentage: one tile per threshold combination,
# annotated with the percentage to one decimal place.
midbase_percent_in_different_threshold_heatmap <- 
  ggplot(
    filter(midbase_percent_in_different_threshold, mid_base == 'C'),
    aes(x = as_factor(pval), y = as_factor(diff_med))
  ) +
  geom_tile(aes(fill = percent)) +
  geom_text(aes(label = scales::number(percent, accuracy = .1)))
midbase_percent_in_different_threshold_heatmap |> 
  ggsave_pdf(outdir = figdir, width = 8, height = 8)

# Number of middle-C sites per threshold combination, as horizontal bars.
# The axis labels contain 'Δ', which the PDF device could not convert (it
# raised "conversion failure ... in 'mbcsToSbcs'" warnings). The labels are
# therefore drawn as plotmath expressions in which the symbol `Delta` stands
# in for the literal character; the underlying x values are unchanged.
midbase_percent_in_different_threshold_num_barplot <- 
  midbase_percent_in_different_threshold |> 
  filter(mid_base == 'C') |> 
  ggplot(aes(
    x = paste('p < ', pval, ', Δmedian > ', diff_med), 
    y = n
  )) +
  labs(x = '') +
  geom_col() +
  scale_x_discrete(
    # "a Δb" becomes the plotmath call paste("a ", Delta, "b").
    labels = \(lbl) parse(
      text = sprintf('paste("%s")', sub('\u0394', '", Delta, "', lbl, fixed = TRUE))
    )
  ) +
  coord_flip() 
midbase_percent_in_different_threshold_num_barplot |> 
  ggsave_pdf(outdir = figdir, width = 8, height = 8)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.01' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.02' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.03' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.04' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.05' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.1' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0.2' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 1' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.01' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.02' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.03' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.04' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.05' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.1' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0.2' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 1' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.01' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.02' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.03' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.04' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.05' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.1' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0.2' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 1' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.01' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.02' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.03' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.04' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.05' in 'mbcsToSbcs': for Δ
## (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.1' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0.2' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 1' in 'mbcsToSbcs': for Δ (U+0394)

Revise supplementary figure

# Stacked bars of the middle-base percentages for the diff_med == 0 rows, one
# bar per p-value cutoff.
# 'Δ' could not be converted by the PDF device (it raised "conversion failure
# ... in 'mbcsToSbcs'" warnings) in two places: the tick labels and the
# default axis title, which is the deparsed aes expression. The tick labels
# are now drawn as plotmath (symbol `Delta`) and the axis title is blanked.
# NOTE(review): the fill colours are unnamed, so they follow the level order
# of reorder(mid_base, percent) rather than base identity; confirm that the
# resulting base-to-colour assignment is the intended one.
midbase_percent_in_different_threshold_Cpercent_barplot <- 
  midbase_percent_in_different_threshold |> 
  filter(diff_med == 0) |> 
  #filter(mid_base == 'C') |> 
  ggplot(aes(
    x = paste('p < ', pval, ', Δmedian > ', diff_med), 
    y = percent, 
    fill = reorder(mid_base, percent) 
  )) +
  labs(x = '') +
  geom_col(position = position_stack()) +
  scale_x_discrete(
    # "a Δb" becomes the plotmath call paste("a ", Delta, "b").
    labels = \(lbl) parse(
      text = sprintf('paste("%s")', sub('\u0394', '", Delta, "', lbl, fixed = TRUE))
    )
  ) +
  scale_fill_manual(values = c('#01C001', '#E6E602', '#5051FF', '#E00800')) +
  coord_flip()
midbase_percent_in_different_threshold_Cpercent_barplot |> 
  ggsave_pdf(outdir = figdir, width = 8, height = 4)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'paste("p < ", pval, ", Δmedian > ", diff_med)' in
## 'mbcsToSbcs': for Δ (U+0394)

# Stacked bars of the middle-base site counts for the diff_med == 0 rows, one
# bar per p-value cutoff.
# 'Δ' could not be converted by the PDF device (it raised "conversion failure
# ... in 'mbcsToSbcs'" warnings) in two places: the tick labels and the
# default axis title, which is the deparsed aes expression. The tick labels
# are now drawn as plotmath (symbol `Delta`) and the axis title is blanked.
# NOTE(review): the fill colours are unnamed, so they follow the level order
# of reorder(mid_base, percent) rather than base identity; confirm that the
# resulting base-to-colour assignment is the intended one.
midbase_percent_in_different_threshold_Cpercent_barplot_num <- 
  midbase_percent_in_different_threshold |> 
  filter(diff_med == 0) |> 
  #filter(mid_base == 'C') |> 
  ggplot(aes(
    x = paste('p < ', pval, ', Δmedian > ', diff_med), 
    y = n, 
    fill = reorder(mid_base, percent) 
  )) +
  labs(x = '') +
  geom_col() +
  scale_x_discrete(
    # "a Δb" becomes the plotmath call paste("a ", Delta, "b").
    labels = \(lbl) parse(
      text = sprintf('paste("%s")', sub('\u0394', '", Delta, "', lbl, fixed = TRUE))
    )
  ) +
  scale_fill_manual(values = c('#01C001', '#E6E602', '#5051FF', '#E00800')) +
  coord_flip()
midbase_percent_in_different_threshold_Cpercent_barplot_num |> 
  ggsave_pdf(outdir = figdir, width = 8, height = 4)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.001 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.01 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.05 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'p < 0.1 , Δmedian > 0' in 'mbcsToSbcs': for Δ (U+0394)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'paste("p < ", pval, ", Δmedian > ", diff_med)' in
## 'mbcsToSbcs': for Δ (U+0394)

# Wide summary of the diff_med == 0 rows: one row per p-value cutoff, with a
# count column and a percentage column for each middle base.
pivot_wider(
  filter(midbase_percent_in_different_threshold, diff_med == 0),
  id_cols = c(pval, diff_med), 
  names_from = mid_base, 
  values_from = c(n, percent)
)
## # A tibble: 4 × 10
##    pval diff_med   n_A   n_C   n_G   n_T percent_A percent_C percent_G percent_T
##   <dbl>    <dbl> <int> <int> <int> <int>     <dbl>     <dbl>     <dbl>     <dbl>
## 1 0.001        0    26   159     2    19      12.6      77.2     0.971      9.22
## 2 0.01         0    48   295     2    26      12.9      79.5     0.539      7.01
## 3 0.05         0    68   489     8    40      11.2      80.8     1.32       6.61
## 4 0.1          0    92   605    10    61      12.0      78.8     1.30       7.94

Export fasta

#' Export the k-mers of increased-intensity sites as FASTA
#'
#' Filters the sites at the given p-value cutoff with `diff_med = 0`, names
#' each record `<transcript_id>|<position>` and passes the names and
#' `ref_kmer` sequences to `export_as_fasta()`.
#'
#' @param pval_thresh P-value cutoff handed to the filter; it is also appended
#'   to the base name of the output file.
export_increased_intensity_sites_as_fasta <- function(pval_thresh) {
  # Relative path (no `wd` prefix).
  fasta_dir <- 'Fasta/DRS_diffthresh/'
  fasta_basename <- paste0('sites_with_increased_current_intensity_pval_', pval_thresh)
  
  increased_sites <- 
    filter_KS_intensity_increase_pval_diff_median(pval = pval_thresh, diff_med = 0)
  
  increased_sites |> 
    mutate(name = paste0(transcript_id, '|', position)) |> 
    select(name, ref_kmer) |> 
    export_as_fasta(
      name = name, sequence = ref_kmer, 
      outdir = fasta_dir, fasta_basename = fasta_basename, compression = ''
    )
}

# Export one FASTA file per p-value cutoff. `pval` is still defined at top
# level as before, but with `<-` and as a plain vector rather than as the
# invisible return value of walk().
pval <- c(.001, .01, .05, .1)
walk(pval, export_increased_intensity_sites_as_fasta)
## 
## Exported to: Fasta/DRS_diffthresh/sites_with_increased_current_intensity_pval_0.001.fa
## 
## 
## Exported to: Fasta/DRS_diffthresh/sites_with_increased_current_intensity_pval_0.01.fa
## 
## 
## Exported to: Fasta/DRS_diffthresh/sites_with_increased_current_intensity_pval_0.05.fa
## 
## 
## Exported to: Fasta/DRS_diffthresh/sites_with_increased_current_intensity_pval_0.1.fa